# load "Ask a A Manager 2021 Survey" googlesheet
# https://www.askamanager.org/
ask_a_manager_2021 <- read_csv(here::here("data","ask_a_manager.csv"))
skimr::skim(ask_a_manager_2021)
| Name | ask_a_manager_2021 |
| Number of rows | 26765 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 14 |
| logical | 2 |
| numeric | 1 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| how_old_are_you | 0 | 1.00 | 5 | 10 | 0 | 7 | 0 |
| industry | 62 | 1.00 | 2 | 171 | 0 | 1084 | 0 |
| job_title | 0 | 1.00 | 1 | 126 | 0 | 12838 | 0 |
| additional_context_on_job_title | 19868 | 0.26 | 1 | 781 | 0 | 6612 | 0 |
| currency | 0 | 1.00 | 3 | 7 | 0 | 11 | 0 |
| additional_context_on_income | 23851 | 0.11 | 1 | 1143 | 0 | 2839 | 0 |
| country | 0 | 1.00 | 1 | 209 | 0 | 297 | 0 |
| state | 4761 | 0.82 | 4 | 114 | 0 | 125 | 0 |
| city | 23 | 1.00 | 1 | 171 | 0 | 4070 | 0 |
| overall_years_of_professional_experience | 0 | 1.00 | 9 | 16 | 0 | 8 | 0 |
| years_of_experience_in_field | 0 | 1.00 | 9 | 16 | 0 | 8 | 0 |
| highest_level_of_education_completed | 202 | 0.99 | 3 | 34 | 0 | 6 | 0 |
| gender | 155 | 0.99 | 3 | 29 | 0 | 5 | 0 |
| race | 151 | 0.99 | 5 | 125 | 0 | 47 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| other_monetary_comp | 26765 | 0 | NaN | : |
| currency_other | 26765 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| annual_salary | 0 | 1 | 144988 | 5488158 | 0 | 54000 | 75712 | 110000 | 8.7e+08 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| timestamp | 0 | 1 | 2021-04-27 11:02:09 | 2021-09-14 21:55:44 | 2021-04-28 12:35:21 | 23989 |
After skimming the dataset, we took the below steps to clean the raw data :
# remove the NA values
ask_a_manager_2021 <- ask_a_manager_2021 %>%
filter(!gender %in% c("NA","Other or prefer not to answer","Prefer not to answer", "Non-binary", NA) &
race!="NA" &
highest_level_of_education_completed!="NA")
# using z-score. remove data out of 3 sigma
ask_a_manager_2021 <- ask_a_manager_2021[-which(abs(scale(ask_a_manager_2021$annual_salary))>3),]
# using quantile. remove data larger than 75%+1.5*IQR or smaller than 25%-1.5*IQR
# Q <- quantile(ask_a_manager_2021$salary, probs=c(.25, .75))
# iqr <- IQR(ask_a_manager_2021$salary)
# up <- Q[2]+1.5*iqr # Upper Range
# up <- Q[1]-1.5*iqr # Upper Range
# ask_a_manager_2021 <- ask_a_manager_2021 %>%
# filter(salary>=low & salary<=up)
factor datatypeNamely, we want to make age, years of working experience, education into factor, and take the log of salary for the next step of EDA.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
mutate(
country = countrycode::countryname(country),
age_group = factor(how_old_are_you,levels=c("under 18","18-24","25-34","35-44","45-54","55-64","65 or over")),
field_exp = factor(years_of_experience_in_field,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")),
pro_exp = factor(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")),
education = factor(highest_level_of_education_completed, levels=c("High School","Some college","College degree","Master's degree","Professional degree (MD, JD, etc.)", "PhD")),
higher_edu = ifelse(education %in% c("College degree","Master's degree", "PhD", "Professional degree (MD, JD, etc.)"),1,0),
salary = annual_salary,
log_salary = log(annual_salary)
) %>%
janitor::clean_names() # clean columns names
Since there are over 1000 reported industries in the dataset, we will only keep the top 25 industries which account for 92% of total entries
# Industry is messy... it has > 1000 different industries.
# we only keep the names of top 25 industries. (24597(92%) out of 26765 entries)
industry_list <- ask_a_manager_2021 %>%
count(industry, sort=TRUE) %>%
mutate(percent = 100* n/sum(n)) %>%
slice_max(order_by = n, n = 25) %>%
select(industry)
ask_a_manager_2021 <- ask_a_manager_2021 %>%
mutate(industry = ifelse(industry %in% industry_list$industry,
industry,
"Others"))
Since there are 11 different currencies reported in the dataset, we want to unify the most common currencies into USD for the convenience of comparison analysis afterwards. We decided to convert all currencies with occurences greater than 100 into USD. This included CAD,GBP,EUR,AUD/NZD. Other was not converted as there wasn’t specific information on the currency.
ask_a_manager_2021 %>%
count(currency, sort=TRUE) %>%
mutate(percent = 100* n/sum(n))
| currency | n | percent |
|---|---|---|
| USD | 21188 | 83.6 |
| CAD | 1492 | 5.88 |
| GBP | 1442 | 5.69 |
| EUR | 558 | 2.2 |
| AUD/NZD | 456 | 1.8 |
| Other | 123 | 0.485 |
| CHF | 32 | 0.126 |
| SEK | 32 | 0.126 |
| JPY | 17 | 0.067 |
| ZAR | 13 | 0.0513 |
| HKD | 4 | 0.0158 |
From the above table, We can see that USD, CAD, GBP, EUR, (AUD/NZD) are the most common currencies. We will convert the currencies all to USD using exchange rate data from library quantmod, and the exchange rate will be taken from the last date of the survey “2021-09-14”
library(quantmod)
#library to get currency data (and other financial data)
#gets the currency rate for the currency pairs. This uses the Oanda which keeps currency data for the last 180 days.
#In 180 days from "2021-09-14" the code will no longer return the currency rates.
currencies <-getSymbols(c("EUR/USD","CAD/USD","GBP/USD","AUD/USD" , "NZD/USD"),
from="2021-09-14", to="2021-09-14", src="oanda")
#In order to preserve the analysis for after 180 days we can hard code the currency rates into a tibble
currencies_preserved <- tibble(
name = c("EUR/USD","CAD/USD","GBP/USD","AUD/USD" , "NZD/USD"),
value = c(1.18,0.79,1.38,0.734,0.711)
)
Here we create a new column called converted_salary which converts the 5 most common currencies in USD. These are then rounded to the nearest whole number.
#the values are taken from the currencies_preserved$value as to preserve the analysis when
# getSymbols will no longer work. The index matches the currency pair.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
mutate(converted_salary = case_when(
currency == "GBP" ~ round(annual_salary*currencies_preserved$value[3]),
currency == "AUD/NZD" & country == "Australia" ~ round(annual_salary*currencies_preserved$value[4]),
currency == "AUD/NZD" & country == "New Zealand" ~ round(annual_salary*currencies_preserved$value[5]),
currency == "CAD" ~ round(annual_salary*currencies_preserved$value[2]),
currency == "EUR" ~ round(annual_salary*currencies_preserved$value[1]),
TRUE ~ round(annual_salary)
))
#Here we filter the dataset to only include the salaries that were converted into USD terms. The other currencies also included lots of outliers which we also get rid of here.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
filter(currency %in% c("USD","GBP","AUD/NZD","EUR","CAD") &
!is.na(country))
ask_a_manager_2021 %>%
count(currency, sort=TRUE) %>%
mutate(percent = 100* n/sum(n))
| currency | n | percent |
|---|---|---|
| USD | 21042 | 84.5 |
| CAD | 1482 | 5.95 |
| GBP | 1370 | 5.5 |
| EUR | 554 | 2.22 |
| AUD/NZD | 451 | 1.81 |
# Some quick counts, groups, etc
ask_a_manager_2021 %>%
count(age_group, sort=TRUE) %>%
mutate(percent = 100* n/sum(n)) %>%
ggplot(aes(x=fct_relevel(age_group,levels=c("under 18","18-24","25-34","35-44","45-54","55-64","65 or over")), y=n)) +
geom_bar(stat="identity") + labs(title = "Age distribution of Ask a Manager survey respondents", x="Age group",
y="Frequency", caption = "Source:Askamanager.com")
# 'country'
ask_a_manager_2021 %>%
count(country, sort=TRUE) %>%
mutate(percent = 100* n/sum(n))
# 'city'
ask_a_manager_2021 %>%
count(city, sort=TRUE) %>%
mutate(percent = 100* n/sum(n))
# education
ask_a_manager_2021 %>%
count(education) %>%
mutate(percent = 100* n/sum(n))%>%
arrange(desc(percent))
# gender
ask_a_manager_2021 %>%
count(gender) %>%
mutate(percent = 100* n/sum(n))%>%
arrange(desc(percent))
# race
ask_a_manager_2021 %>%
count(race) %>%
mutate(percent = 100* n/sum(n))%>%
arrange(desc(percent))
# pro_exp
ask_a_manager_2021 %>%
count(pro_exp ) %>%
mutate(percent = 100* n/sum(n))%>%
arrange(desc(percent))
# field_exp
ask_a_manager_2021 %>%
count(field_exp ) %>%
mutate(percent = 100* n/sum(n))%>%
arrange(desc(percent))
#make comments about the counts
How is salary distributed?
favstats(ask_a_manager_2021$salary) # find a really rich guy here
| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 0 | 5.5e+04 | 7.6e+04 | 1.09e+05 | 5e+06 | 8.93e+04 | 7.07e+04 | 24899 | 0 |
# density
g1 <- ggplot(ask_a_manager_2021, aes(x=salary))+
geom_density()+
NULL
#cdf
g2 <- ask_a_manager_2021 %>%
slice_min(order_by = salary,n =nrow(ask_a_manager_2021)-5) %>% # choose a number
ggplot(aes(x=salary))+
stat_ecdf()+
NULL
# taking log is better
g3 <- ggplot(ask_a_manager_2021, aes(x=log(salary)))+
geom_density()+
NULL
g4 <- ask_a_manager_2021 %>%
slice_min(order_by = salary,n =nrow(ask_a_manager_2021)-5) %>% # choose a number
ggplot(aes(x=log(salary)))+
stat_ecdf()+
NULL
gridExtra::grid.arrange(g1,g2,g3,g4)
Observation: This is a right skewed distribution.
ask_a_manager_2021 %>%
count(industry, sort=TRUE) %>%
ggplot(aes(y=fct_reorder(industry,n), x=n))+
geom_col()
Observations:
We are interested in the type of industries with top salary for man and woman.
# Industries of top average salary for Man and Woman
salary_table <- ask_a_manager_2021 %>%
group_by(industry, gender) %>%
summarise(count = n(),
avg_salary = mean(converted_salary),
higher_edu_prop = sum(higher_edu)/n()) %>%
pivot_wider(id_cols = 1:5,
names_from = gender,
values_from = count:higher_edu_prop)
g_man <- salary_table %>%
ggplot(aes(y=fct_reorder(industry,avg_salary_Man), x=avg_salary_Man,fill=higher_edu_prop_Man))+
geom_bar(position="dodge", stat="identity")+
geom_text(aes(label=paste(avg_salary_Man%/%1000,"k")),
position=position_dodge(width=0.9), hjust=0,vjust=0)+
scale_x_continuous(labels = scales::comma) +
scale_fill_gradient(low="sky blue", high="blue")+
labs(
title="Man",
x="Annual Salary $",
y="Industry",
fill="Higher Education %"
)+
theme_wsj()+
theme(legend.position="bottom",
legend.title=element_text(size=13))+
NULL
# ggsave("highest_paid_industries_male.png",width = 12,height = 8,limitsize = F)
g_woman <- salary_table %>%
ggplot(aes(y=fct_reorder(industry,avg_salary_Woman), x=avg_salary_Woman,fill=higher_edu_prop_Woman))+
geom_bar( position="dodge", stat="identity")+
geom_text(aes(label=paste(avg_salary_Woman%/%1000,"k")),
position=position_dodge(width=0.9), hjust=0,vjust=0)+
scale_x_continuous(labels = scales::comma) +
scale_fill_gradient(low="pink", high="red")+
labs(
title="Woman",
x="Annual Salary $",
y="Industry",
fill="Higher Education %"
)+
theme_wsj()+
theme(legend.position="bottom",
legend.title=element_text(size=13))+
NULL
# ggsave("highest_paid_industries_female.png",width = 12,height = 8,limitsize = F)
Observations:
We are also interested in the salary difference between man and woman within the same industry.
# salary gap in different industries
ask_a_manager_2021 %>%
group_by(industry, gender) %>%
summarise(count = n(),
avg_salary = mean(converted_salary)) %>%
pivot_wider(id_cols = 1:4,
names_from = gender,
values_from = count:avg_salary) %>%
mutate(size = count_Man+count_Woman,
male_prop = count_Man/(count_Man+count_Woman),
salary_gap = avg_salary_Man - avg_salary_Woman) %>%
ggplot(aes(y=fct_reorder(industry,salary_gap), x=salary_gap,fill=male_prop))+
geom_col()+
# geom_point(aes(y=fct_reorder(industry,size),x=size*10))+
scale_x_continuous(
breaks = seq(-20000,40000,5000)
) +
scale_fill_gradient(low="pink", high="blue")+
labs(
title="Salary Gap and Sex Ratio",
x="Salary Gap in $",
y="",
fill="Male %"
)+
theme_wsj()+
theme(legend.position="bottom",
legend.title=element_text(size=13))+
NULL
# ggsave("salary_gap.png",width = 12,height = 8,limitsize = F)
Observations:
Next, we want to see if there is a difference in the salary gap between man and woman for different education level.
ask_a_manager_2021 %>%
group_by(education, gender) %>%
summarise(count = n(),
avg_salary = mean(converted_salary)) %>%
pivot_wider(id_cols = 1:4,
names_from = gender,
values_from = count:avg_salary) %>%
mutate(size = count_Man+count_Woman,
male_prop = count_Man/(count_Man+count_Woman),
salary_gap = avg_salary_Man - avg_salary_Woman) %>%
ggplot(aes(x=education, y=salary_gap,fill=male_prop))+
geom_col()+
scale_y_continuous(
breaks = seq(-20000,40000,5000)
) +
scale_fill_gradient(low="pink", high="sky blue")+
theme_wsj()+
theme(legend.position="bottom", axis.title=element_text(size=24))+
ggtitle("Salary Gap at Different Education level") +
xlab("Education Level") + ylab ("Salary Gap in $") +
NULL
# ggsave("salary_gap_by_edu.png",width = 14,height = 8,limitsize = F)
ask_a_manager_2021 %>%
group_by(race) %>%
filter(n() > 500) %>%
summarise(count=n()) %>%
ggplot(aes(y=count, x=race, fill=race))+
geom_col() +
theme_wsj()+
theme(legend.position = "none") +
labs(x="Race", y="Count", title="Count by Race", subtitle = "Races with > 500 Respondents")+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
NULL
ask_a_manager_2021 %>%
filter(industry %in% industry_list$industry) %>%
group_by(race) %>%
filter(n() > 500) %>%
ggplot( aes(x=race, y=converted_salary, color=race)) +
geom_jitter(color="black", size=0.01, alpha=0.9) +
geom_boxplot(alpha=0.4) +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
geom_hline(yintercept=144988, color="red") +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
theme_wsj() +
theme(legend.position = "none")+
theme(axis.title=element_text(size=10))+
ggtitle("Salary Vs. Race Boxplot") +
ylab("Salary") + xlab ("Race") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
NULL
Observation:
In our boxplot one can visualize both the distribution of the data and the medium and interquartile ranges of the data. When comparing annual salary by race, our data shows that Asians have the higher salaries in comparison to Black and African American and White sample respondents. Interestingly enough, our sample also shows that the medium salary for Black and AA respondents is higher than that of White respondents.
Since most repondents were white, the larger sample size is less prone to skewness due to outliers when comparing it to the other two races portrayed in the plot.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
group_by(race) %>%
filter(race %in% c("Black or African American", "White"))
t.test(salary ~ race, data = ask_a_manager_2021_race)
##
## Welch Two Sample t-test
##
## data: salary by race
## t = 1, df = 605, p-value = 0.3
## alternative hypothesis: true difference in means between group Black or African American and group White is not equal to 0
## 95 percent confidence interval:
## -6923 26250
## sample estimates:
## mean in group Black or African American mean in group White
## 97570 87906
When running a T-test on the two racial groups, White and Black or African American, the P-value we get is 0.9. Thus there is no statistically signifcant difference in their annual salary when calculating it based of off this data set.
#This package is useful for creating a map of the USA
library(usmap)
library(ggrepel)
#create df of the list of states with >10 occurrences
#This allows us to get data for actual states as many data entry errors
#many individuals put multiple states and this helps to fix that
states <- ask_a_manager_2021 %>%
group_by(state) %>%
filter(!is.na(state)) %>%
filter(n()>10) %>%
summarise(mean_salary = mean(salary),
count_state = count(state))
states$fips <- fips(states$state)
statesalarymap <- states %>% select(state,mean_salary)
plot_usmap(data=statesalarymap, values="mean_salary", color="black") + scale_fill_continuous(
low = "white",
high = "light green",
name = "Mean Salary") +
theme(legend.position = "right") +
labs(title="Map of US States based on Average Salary")
MAP
# Count by Industry
ask_a_manager_2021 %>%
group_by(industry) %>%
summarise(count=n()) %>%
ggplot(aes(y=fct_reorder(industry,count),x=count))+
geom_col()
# null hypothesis: no difference between
ask_a_manager_2021 %>%
group_by(age_group, gender) %>%
summarise(count=n())
## # A tibble: 13 x 3
## # Groups: age_group [7]
## age_group gender count
## <fct> <chr> <int>
## 1 under 18 Woman 6
## 2 18-24 Man 168
## 3 18-24 Woman 759
## 4 25-34 Man 1853
## 5 25-34 Woman 9155
## 6 35-44 Man 1907
## 7 35-44 Woman 7110
## 8 45-54 Man 616
## 9 45-54 Woman 2327
## 10 55-64 Man 156
## 11 55-64 Woman 754
## 12 65 or over Man 17
## 13 65 or over Woman 71
#ggplot(aes(y=fct_reorder(how_old_are_you,count),x=count))+
#geom_col()
# Count by Age Group
ask_a_manager_2021 %>%
group_by(age_group) %>%
summarise(count=n()) %>%
ggplot(aes(y=fct_reorder(age_group,count),x=count))+
geom_col()
# Sorting out the people who are 18+
ask_a_manager_age <- ask_a_manager_2021 %>%
mutate(factor(age_group,levels=c("18-24","25-34","35-44","45-54","55-64","65 or over")))
#need to change colors for male and female ?!
#there are only 6 women in under 18 category so excluding them for the graph
# since there is no male data in that category to compare
ask_a_manager_age %>%
filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64" )) %>%
group_by(age_group, gender) %>%
summarise (median_salary_by_age = median (salary),
mean_salary_by_age = mean(salary)) %>%
ggplot () +
geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
theme_bw()+
#formatC(x, format = "d")+
labs( title = "Median Salary by Age Group" , y="Salary" ,x = "Age Group") +
NULL
ask_a_manager_2021_changed <- ask_a_manager_2021 %>%
mutate(if_changed= if_else(overall_years_of_professional_experience == years_of_experience_in_field, "No", "Yes"))
ask_a_manager_2021_changed %>%
ggplot(aes(x=fct_relevel(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")), y=annual_salary, color=if_changed))+
# scatter plot
geom_point(alpha = 0.5)+
# linear smooth line
geom_smooth(method = "lm")+
# legend settings
theme(legend.position = "bottom",
legend.title = element_blank())+
labs(title="Whether changing job affects your salary",
x="Total Years Spent Working",
y="Salary")+
theme_bw()+
NULL
ask_a_manager_2021_changed %>%
group_by(overall_years_of_professional_experience, if_changed) %>%
summarise (median_salary_over_time = median (salary),
mean_salary_over_time = mean(salary)) %>%
ggplot () +
geom_col(aes (fct_relevel(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")), median_salary_over_time, fill=if_changed), position = position_dodge(width = 0.9), alpha = 0.7)+
geom_point(aes(x = overall_years_of_professional_experience, y=mean_salary_over_time, color = if_changed))+
theme_bw()+
#formatC(x, format = "d")+
labs( title = "Median Salary over time" , y="Salary" ,x = "Prof") +
NULL
ggsave("switch_industry.png",width = 12,height = 8,limitsize = F)
mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021)
| gender | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| Man | 0 | 6.5e+04 | 9.6e+04 | 1.41e+05 | 2.11e+06 | 1.12e+05 | 8.33e+04 | 4717 | 0 |
| Woman | 0 | 5.3e+04 | 7.2e+04 | 1e+05 | 5e+06 | 8.38e+04 | 6.63e+04 | 20182 | 0 |
ask_a_manager_2021 %>%
ggplot( aes(x=gender, y=converted_salary, color=gender)) +
geom_jitter(color="black", size=0.4, alpha=0.9) +
geom_boxplot(alpha=0.3) +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
theme_wsj() +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
theme(axis.title=element_text(size=12))+
ggtitle("Salary Vs. Gender Boxplot with Jitter") +
ylab("Salary") + xlab ("Race") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
scale_y_continuous(labels = scales::comma, limits = c(0, 1000000)) +
NULL
For our following analysis, we will be comparing data from the male gender “Man” with data from the female gender “Woman.” We will filter out the data on those that are non binary.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
filter(gender!="Non-binary")
t.test(converted_salary ~ gender, data = ask_a_manager_2021)
##
## Welch Two Sample t-test
##
## data: converted_salary by gender
## t = 21, df = 6183, p-value <2e-16
## alternative hypothesis: true difference in means between group Man and group Woman is not equal to 0
## 95 percent confidence interval:
## 25223 30318
## sample estimates:
## mean in group Man mean in group Woman
## 111551 83781
#mosaic::favstats (annual_salary ~ gender, data=ask_a_manager_2021)
ask_a_manager_2021_gender <- ask_a_manager_2021 %>%
mutate(gender2 = case_when(
gender == "Man" ~ "1",
gender == "Woman" ~ "0",
TRUE ~ "NA"
)) %>%
mutate(gender2 = as.numeric(gender2))
mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021_gender) %>%
mutate(t_critical = qt(0.975,n-1),
sd_mean = sd/sqrt(n),
margin_of_error = t_critical*sd_mean,
ci_lower = mean-margin_of_error,
ci_higher = mean+margin_of_error)
| gender | min | Q1 | median | Q3 | max | mean | sd | n | missing | t_critical | sd_mean | margin_of_error | ci_lower | ci_higher |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Man | 0 | 6.5e+04 | 9.6e+04 | 1.41e+05 | 2.11e+06 | 1.12e+05 | 8.33e+04 | 4717 | 0 | 1.96 | 1.21e+03 | 2.38e+03 | 1.09e+05 | 1.14e+05 |
| Woman | 0 | 5.3e+04 | 7.2e+04 | 1e+05 | 5e+06 | 8.38e+04 | 6.63e+04 | 20182 | 0 | 1.96 | 466 | 914 | 8.29e+04 | 8.47e+04 |
# Add graph of two distributions and confidence intervals
Observation:
# hypothesis testing using infer package
diff <- ask_a_manager_2021_gender %>%
specify(converted_salary ~ gender) %>%
calculate(stat = "diff in means", order = c("Woman", "Man"))
set.seed(1234)
salary_in_null_world <- ask_a_manager_2021_gender %>%
filter(gender !="Non-binary") %>%
specify(converted_salary ~ gender) %>%
# assuming independence of gender and salary
hypothesize(null = "independence") %>%
# generate 1000 reps, of type "permute"
generate(reps = 1000, type = "permute") %>%
# calculate statistic of difference, namely "diff in means"
calculate(stat = "diff in means", order = c("Woman", "Man"))
salary_in_null_world %>%
visualise()+
shade_p_value(obs_stat = diff, direction = "two-sided")
p_value <- salary_in_null_world %>%
get_pvalue(obs_stat = diff, direction="both")
p_value
| p_value |
|---|
| 0 |
Observation:
We want to look at the relationship between one’s gender and education degree. In essence, we want to explore if there is a difference in the proportion of respondents with high school degree as their highest degree for male group and the female group. We can use the proportion test to generate the result from the male and female sample.
ask_a_manager_2021_education <- ask_a_manager_2021 %>%
filter(gender !="Non-binary") %>%
mutate(below_college = if_else(highest_level_of_education_completed %in% c("High School", "Some college"), "Yes", "No")) %>%
group_by(gender) %>%
summarise(college_below=sum(below_college == "Yes"), college_or_above = sum(below_college == "No"))
ask_a_manager_2021_education
| gender | college_below | college_or_above |
|---|---|---|
| Man | 687 | 4030 |
| Woman | 1598 | 18584 |
prop.test(x = c(4030,18584), n = c(4030+687,18584+1598), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c4030 out of 4030 + 68718584 out of 18584 + 1598
## X-squared = 202, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0773 -0.0556
## sample estimates:
## prop 1 prop 2
## 0.854 0.921
Observations:
If we repeat the process to explore the difference in proportion in acquiring a masters or above degree for both gender:
ask_a_manager_2021_education <- ask_a_manager_2021 %>%
filter(gender !="Non-binary") %>%
mutate(higher_edu = if_else(highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD"), "Yes", "No")) %>%
group_by(gender) %>%
summarise(master_below = sum(higher_edu == "No"), master_or_above=sum(higher_edu == "Yes") )
ask_a_manager_2021_education
| gender | master_below | master_or_above |
|---|---|---|
| Man | 3063 | 1654 |
| Woman | 11260 | 8922 |
prop.test(x = c(1654, 8922), n = c(1654+3063, 8922+11260), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c1654 out of 1654 + 30638922 out of 8922 + 11260
## X-squared = 130, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.1068 -0.0761
## sample estimates:
## prop 1 prop 2
## 0.351 0.442
Observations:
Here we want to see if race is affecting the salary.
ask_a_manager_salary <- ask_a_manager_2021 %>%
mutate(if_white = if_else(race == "White", "Yes", "No")) %>%
select(if_white,salary)
t.test(salary~if_white, data = ask_a_manager_salary, success = "Yes")
##
## Welch Two Sample t-test
##
## data: salary by if_white
## t = 5, df = 4313, p-value = 6e-07
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 5472 12543
## sample estimates:
## mean in group No mean in group Yes
## 96913 87906
We want to look at the relationship between one’s race and education degree. Particularly, we want to see if there is a difference in education level for those being white and those being non-white.
As the above analysis for gender vs. education, we run two proportion tests for college-or-above degree percentage and master-or-above percentage between the male and female group.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
mutate(if_white = if_else(race == "White", "White", "Non-white"),
below_college = if_else(highest_level_of_education_completed %in% c( "High School", "Some college"), "Yes", "No")) %>%
group_by(if_white) %>%
summarise(college_below=sum(below_college == "Yes"), college_or_above = sum(below_college == "No"))
ask_a_manager_2021_race
| if_white | college_below | college_or_above |
|---|---|---|
| Non-white | 350 | 3510 |
| White | 1935 | 19104 |
prop.test(x = c(3510,19104), n = c(3510+350, 19104+1935), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c3510 out of 3510 + 35019104 out of 19104 + 1935
## X-squared = 0.05, df = 1, p-value = 0.8
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.00872 0.01132
## sample estimates:
## prop 1 prop 2
## 0.909 0.908
Similarly, if we run the test for the propotion of master_or_above education level between white and non-white races:
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
mutate(if_white = if_else(race == "White", "White", "Non-white"),
higher_edu = if_else(highest_level_of_education_completed %in% c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD"), "Yes", "No")) %>%
group_by(if_white) %>%
summarise(master_below = sum(higher_edu == "No"), master_or_above=sum(higher_edu == "Yes") )
ask_a_manager_2021_race
| if_white | master_below | master_or_above |
|---|---|---|
| Non-white | 2347 | 1513 |
| White | 11976 | 9063 |
prop.test(x = c(1513, 9063), n = c(1513+2347, 9063+11976), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c1513 out of 1513 + 23479063 out of 9063 + 11976
## X-squared = 20, df = 1, p-value = 8e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0557 -0.0219
## sample estimates:
## prop 1 prop 2
## 0.392 0.431
Observations:
From the result of the proportion test, we cannot see a clear difference between the proportion of education level in white and non-white group, as the p-value is higher than 5%.
However, the proportion of acquiring higher education degree (Master’s-or-above) is higher in the white group vs. other races
t.test(salary~if_changed, data = ask_a_manager_2021_changed)
##
## Welch Two Sample t-test
##
## data: salary by if_changed
## t = 15, df = 24238, p-value <2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 11569 15074
## sample estimates:
## mean in group No mean in group Yes
## 95369 82047
prop.test(~if_changed, data = ask_a_manager_2021_changed, success = "Yes")
##
## 1-sample proportions test with continuity correction
##
## data: ask_a_manager_2021_changed$if_changed [with success = Yes]
## X-squared = 198, df = 1, p-value <2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.449 0.462
## sample estimates:
## p
## 0.455
ask_a_manager_age %>%
filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64" )) %>%
group_by(age_group, gender) %>%
summarise (median_salary_by_age = median (salary),
mean_salary_by_age = mean(salary)) %>%
ggplot () +
geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
theme_bw()+
#formatC(x, format = "d")+
labs( title = "Median Salary by Age Group" , y="Salary" ,x = "Age Group") +
NULL
colnames(ask_a_manager_2021)
reg_data <- ask_a_manager_2021 %>%
mutate(country = ifelse(country=="US","US","Non-US"))
m1 <- lm(converted_salary ~ education, data=reg_data)
m2 <- lm(converted_salary ~ education + age_group, data=reg_data)
m3 <- lm(converted_salary ~ education + gender, data=reg_data)
m4 <- lm(converted_salary ~ education + age_group + gender + pro_exp + field_exp, data=reg_data)
m5 <- lm(converted_salary ~ education + age_group + gender + field_exp, data=reg_data)
huxreg(m1, m2, m3, m4, m5,
statistics = c('#observations' = 'nobs',
'R squared' = 'r.squared',
'Adj. R Squared' = 'adj.r.squared',
'Residual SE' = 'sigma'),
bold_signif = 0.05,
stars = NULL
) %>%
set_caption('Comparison of models')
| (1) | (2) | (3) | (4) | (5) | |
|---|---|---|---|---|---|
| (Intercept) | 73753.850 | 62856.643 | 92165.971 | 70905.720 | 65882.024 |
| (3031.177) | (28236.014) | (3065.946) | (27781.746) | (27527.453) | |
| educationSome college | 1146.369 | 178.194 | 4490.817 | 4192.416 | 4395.104 |
| (3456.755) | (3426.821) | (3410.103) | (3333.414) | (3333.739) | |
| educationCollege degree | 10936.411 | 14478.221 | 16607.004 | 20368.221 | 20329.478 |
| (3096.939) | (3076.147) | (3060.401) | (3001.029) | (2997.480) | |
| educationMaster's degree | 16560.585 | 17400.650 | 23572.779 | 24328.428 | 24326.172 |
| (3128.923) | (3106.455) | (3095.672) | (3036.162) | (3030.978) | |
| educationProfessional degree (MD, JD, etc.) | 60645.762 | 60842.097 | 68008.053 | 70035.175 | 69643.884 |
| (3625.164) | (3597.836) | (3584.357) | (3518.621) | (3508.744) | |
| educationPhD | 30306.472 | 29312.578 | 36070.156 | 39374.432 | 38793.181 |
| (3597.624) | (3568.901) | (3553.194) | (3498.655) | (3478.807) | |
| age_group18-24 | -17669.678 | -9879.743 | -9267.739 | ||
| (28271.961) | (27567.347) | (27491.310) | |||
| age_group25-34 | 1111.921 | -3876.079 | -2008.487 | ||
| (28188.442) | (27463.620) | (27408.375) | |||
| age_group35-44 | 15983.686 | -5471.914 | -1500.703 | ||
| (28187.758) | (27457.952) | (27410.485) | |||
| age_group45-54 | 20370.007 | -11070.008 | -7739.091 | ||
| (28206.456) | (27461.839) | (27431.085) | |||
| age_group55-64 | 25471.862 | -8051.878 | -6100.110 | ||
| (28269.960) | (27527.157) | (27518.041) | |||
| age_group65 or over | 17678.366 | -14583.331 | -14958.135 | ||
| (29120.832) | (28447.729) | (28432.793) | |||
| genderWoman | -30004.937 | -26329.654 | -26229.759 | ||
| (1117.505) | (1099.665) | (1099.280) | |||
| pro_exp2 - 4 years | -7805.873 | ||||
| (4204.339) | |||||
| pro_exp5-7 years | -5300.576 | ||||
| (4351.911) | |||||
| pro_exp8 - 10 years | -1392.495 | ||||
| (4392.389) | |||||
| pro_exp11 - 20 years | 337.324 | ||||
| (4472.274) | |||||
| pro_exp21 - 30 years | -1275.784 | ||||
| (4821.102) | |||||
| pro_exp31 - 40 years | -2767.468 | ||||
| (5781.403) | |||||
| pro_exp41 years or more | -12687.960 | ||||
| (9105.571) | |||||
| field_exp2 - 4 years | 8411.615 | 6500.929 | |||
| (2505.791) | (2225.984) | ||||
| field_exp5-7 years | 17208.320 | 16989.085 | |||
| (2596.075) | (2275.486) | ||||
| field_exp8 - 10 years | 25127.188 | 26530.695 | |||
| (2694.501) | (2345.796) | ||||
| field_exp11 - 20 years | 36182.740 | 37489.263 | |||
| (2725.230) | (2381.517) | ||||
| field_exp21 - 30 years | 57087.061 | 57240.743 | |||
| (3327.637) | (2920.774) | ||||
| field_exp31 - 40 years | 51244.416 | 50426.815 | |||
| (5212.230) | (4654.659) | ||||
| field_exp41 years or more | 49007.992 | 42568.136 | |||
| (12596.801) | (11634.336) | ||||
| #observations | 24899 | 24899 | 24899 | 24899 | 24899 |
| R squared | 0.028 | 0.047 | 0.056 | 0.101 | 0.100 |
| Adj. R Squared | 0.028 | 0.047 | 0.056 | 0.100 | 0.099 |
| Residual SE | 69651.152 | 68979.521 | 68665.310 | 67032.046 | 67051.976 |